Set up
Load libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.1.8
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(Seurat)
## Attaching SeuratObject
library(clusterProfiler)
##
## clusterProfiler v4.4.4 For help: https://yulab-smu.top/biomedical-knowledge-mining-book/
##
## If you use clusterProfiler in published research, please cite:
## T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal enrichment tool for interpreting omics data. The Innovation. 2021, 2(3):100141
##
## Attaching package: 'clusterProfiler'
##
## The following object is masked from 'package:purrr':
##
## simplify
##
## The following object is masked from 'package:stats':
##
## filter
library(org.Mm.eg.db)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
##
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
##
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
##
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
##
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
##
## The following object is masked from 'package:clusterProfiler':
##
## rename
##
## The following objects are masked from 'package:lubridate':
##
## second, second<-
##
## The following objects are masked from 'package:dplyr':
##
## first, rename
##
## The following object is masked from 'package:tidyr':
##
## expand
##
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
##
##
## Attaching package: 'IRanges'
##
## The following object is masked from 'package:clusterProfiler':
##
## slice
##
## The following object is masked from 'package:lubridate':
##
## %within%
##
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
##
## The following object is masked from 'package:purrr':
##
## reduce
##
## The following object is masked from 'package:grDevices':
##
## windows
##
##
## Attaching package: 'AnnotationDbi'
##
## The following object is masked from 'package:clusterProfiler':
##
## select
##
## The following object is masked from 'package:dplyr':
##
## select
set seed
set.seed(6)
load s3_celltypes.rds
so_merge <- readRDS('analysis_files/s4_so_merge.rds')
Create cluster / celltype dictionary
# Colors for clusters
color_dict <- so_merge@meta.data %>%
dplyr::select(lineage, cluster_l2, celltype_full) %>%
distinct() %>%
arrange(lineage, cluster_l2)
color_dict$color <- scales::hue_pal()(nrow(color_dict))
color_dict <- color_dict %>%
arrange(cluster_l2)
# Colors for lineage
lineage_color_dict <- so_merge@meta.data %>%
dplyr::select(lineage) %>%
distinct() %>%
arrange(lineage)
lineage_color_dict$color <- scales::hue_pal()(nrow(lineage_color_dict))
lineage_cols <- lineage_color_dict$color
names(lineage_cols) <- lineage_color_dict$lineage
# Colors for species
species_cols <- RColorBrewer::brewer.pal(n = 3, name = 'Set1')[1:2]
names(species_cols) <- c('mouse', 'human')
Figure 6
Figure 6A - lineage & cluster umap
1200 x 600 pixels
p1 <- DimPlot(so_merge,
group.by = 'lineage',
label = FALSE,
repel = TRUE,
cols = lineage_color_dict$color)+
coord_equal()+
ggtitle('Lineage')+
theme(plot.title = element_text(size = 25, face = 'bold'),
legend.text = element_text(face = 'bold'))+
guides(color = guide_legend(override.aes = list(size = 7)))
p2 <- DimPlot(so_merge,
group.by = 'seurat_clusters',
label = TRUE,
label.size = 6,
repel = FALSE,
cols = color_dict$color)+
coord_equal()+
ggtitle('Unsupervised Clusters')+
theme(plot.title = element_text(size = 25, face = 'bold'),
legend.position = 'none')
p2+p1
Figure 6B - Tumor-celltype distribution
# Pull metadata and construct tumor<->cellfrac_type dictionary
so_merge_meta <- so_merge@meta.data
tumor_cellfrac <- so_merge_meta %>%
dplyr::select(tumor, cellfrac_type) %>%
distinct()
# compute lineage proportions & join with tumor<->cellfrac_type dictionary
props <- so_merge_meta %>%
dplyr::select(lineage, tumor) %>%
table() %>%
prop.table(., margin = 2)
prop_gather <- as_tibble(props)
prop_gather <- full_join(x = prop_gather,
y = tumor_cellfrac,
by = 'tumor',
suffix = c('', ''))
f6b <- ggplot(prop_gather, aes(x = tumor, y = n, fill = lineage))+
geom_col()+
theme_bw()+
facet_grid(~cellfrac_type, scales = 'free_x', space = 'free_x')+
ylab('Fraction of tumor')+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggsave("fig6b.png", f6b, height = 3, width = 4.5, dpi = 500)
Figure 6C - Distribution of unsupervised clusters (cell types)
1080 x 540
so_meta <- so_merge@meta.data
so_meta_tumors <- so_meta %>%
dplyr::select(celltype_full, cellfrac_type, lineage, tumor) %>%
group_by(tumor, cellfrac_type, celltype_full, lineage) %>%
summarize(n_cells = n()) %>%
group_by(tumor) %>%
mutate(freq = n_cells/sum(n_cells))
## `summarise()` has grouped output by 'tumor', 'cellfrac_type', 'celltype_full'.
## You can override using the `.groups` argument.
so_meta_tumors_grouped <- so_meta_tumors %>%
group_by(cellfrac_type, celltype_full) %>%
mutate(mean_freq = mean(freq)) %>%
mutate(freq_error_ymin = mean(freq) - sd(freq)) %>%
mutate(freq_error_ymax = mean(freq) + sd(freq))
so_meta_tumors_summary <- so_meta_tumors %>%
group_by(cellfrac_type, celltype_full) %>%
summarize(tumor_count = n(),
mean_freq = mean(freq),
freq_sd_ymin = mean(freq) - sd(freq),
freq_sd_ymax = mean(freq) + sd(freq),
freq_sem_ymin = mean(freq) - sd(freq)/(tumor_count)^0.5,
freq_sem_ymax = mean(freq) + sd(freq)/(tumor_count)^0.5,
lineage = lineage)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## `summarise()` has grouped output by 'cellfrac_type', 'celltype_full'. You can
## override using the `.groups` argument.
# mean freq of tumor
ggplot(so_meta_tumors_summary, aes(y = mean_freq, x = celltype_full, fill = cellfrac_type))+
geom_col(position = 'dodge')+
geom_point(data = so_meta_tumors, aes(x = celltype_full, y = freq), color = 'gray50', size = 1.5)+
theme_bw()+
geom_errorbar(aes(x = celltype_full, ymin = freq_sem_ymin, ymax = freq_sem_ymax), position = 'dodge', width = 0.5)+
facet_grid(rows = vars(cellfrac_type), cols = vars(lineage), scales = 'free_x', space = 'free_x')+
theme(strip.text.x = element_text(angle = 90))+
ylab('Mean frequency \n (fraction of tumor)')+
xlab('Celltype \n (cluster-celltype)')+
RotatedAxis()
Fig 6D: UMAP by lineage
## epi
so_epi <- subset(so_merge, subset = lineage == 'epithelial')
so_epi <- RunUMAP(so_epi, reduction = 'iNMF', dims = 1:50)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 17:17:01 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:17:01 Read 2824 rows and found 50 numeric columns
## 17:17:01 Using Annoy for neighbor search, n_neighbors = 30
## 17:17:01 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:17:01 Writing NN index file to temp file C:\Users\calistri\AppData\Local\Temp\1\RtmpE9uzsI\file7a0c17bb2fa9
## 17:17:01 Searching Annoy index using 1 thread, search_k = 3000
## 17:17:02 Annoy recall = 100%
## 17:17:02 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 17:17:03 Initializing from normalized Laplacian + noise (using irlba)
## 17:17:03 Commencing optimization for 500 epochs, with 121422 positive edges
## 17:17:11 Optimization finished
DimPlot(so_epi,
group.by = 'celltype_full',
label = TRUE,
label.size = 5)+
coord_equal()+
ggtitle('Epithelial')
VlnPlot(so_epi,
features = c('Myc', 'Pten'))
## fibro
so_fibro <- subset(so_merge, subset = lineage == 'fibroblast')
so_fibro <- RunUMAP(so_fibro, reduction = 'iNMF', dims = 1:50)
## 17:17:13 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:17:13 Read 1533 rows and found 50 numeric columns
## 17:17:13 Using Annoy for neighbor search, n_neighbors = 30
## 17:17:13 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:17:13 Writing NN index file to temp file C:\Users\calistri\AppData\Local\Temp\1\RtmpE9uzsI\file7a0c32323dd9
## 17:17:13 Searching Annoy index using 1 thread, search_k = 3000
## 17:17:13 Annoy recall = 100%
## 17:17:14 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 17:17:15 Initializing from normalized Laplacian + noise (using irlba)
## 17:17:15 Commencing optimization for 500 epochs, with 61640 positive edges
## 17:17:19 Optimization finished
DimPlot(so_fibro,
group.by = 'celltype_full',
label = TRUE,
label.size = 5)+
coord_equal()+
ggtitle('Fibroblast')
## lym
so_lym <- subset(so_merge, subset = lineage == 'lymphoid')
so_lym <- RunUMAP(so_lym, reduction = 'iNMF', dims = 1:50)
## 17:17:21 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:17:21 Read 3414 rows and found 50 numeric columns
## 17:17:21 Using Annoy for neighbor search, n_neighbors = 30
## 17:17:21 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:17:22 Writing NN index file to temp file C:\Users\calistri\AppData\Local\Temp\1\RtmpE9uzsI\file7a0c59355c01
## 17:17:22 Searching Annoy index using 1 thread, search_k = 3000
## 17:17:23 Annoy recall = 100%
## 17:17:23 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 17:17:24 Initializing from normalized Laplacian + noise (using irlba)
## 17:17:24 Commencing optimization for 500 epochs, with 163904 positive edges
## 17:17:34 Optimization finished
DimPlot(so_lym,
group.by = 'celltype_full',
label = TRUE,
label.size = 5)+
coord_equal()+
ggtitle('Lymphoid')
## mye
so_mye <- subset(so_merge, subset = lineage == 'myeloid')
so_mye <- RunUMAP(so_mye, reduction = 'iNMF', dims = 1:50)
## 17:17:36 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:17:36 Read 5706 rows and found 50 numeric columns
## 17:17:36 Using Annoy for neighbor search, n_neighbors = 30
## 17:17:36 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:17:37 Writing NN index file to temp file C:\Users\calistri\AppData\Local\Temp\1\RtmpE9uzsI\file7a0c246b422
## 17:17:38 Searching Annoy index using 1 thread, search_k = 3000
## 17:17:39 Annoy recall = 100%
## 17:17:40 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 17:17:41 Initializing from normalized Laplacian + noise (using irlba)
## 17:17:41 Commencing optimization for 500 epochs, with 269200 positive edges
## 17:17:56 Optimization finished
DimPlot(so_mye,
group.by = 'celltype_full',
label = TRUE,
label.size = 5,
repel = TRUE)+
coord_equal()+
ggtitle('Myeloid')
Figure 6E+6F: Epithelial heatmap + ego
720 x 720 heatmap
540 x 540 emapplot
library(enrichplot)
so_epi <- subset(so_merge, subset = lineage == 'epithelial')
epi_markers <- read_csv('analysis_files/s4_epithelial_markers.csv')
## Rows: 5181 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): gene
## dbl (6): p_val, avg_log2FC, pct.1, pct.2, p_val_adj, cluster
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
top_epi_markers <- epi_markers %>%
group_by(cluster) %>%
filter(p_val_adj <= 0.01) %>%
arrange(desc(avg_log2FC)) %>%
slice_head(n = 5)
DoHeatmap(so_epi,
features = top_epi_markers$gene)+
ggtitle('Top epithelial genes')
DotPlot(so_epi,
features = top_epi_markers$gene,
group.by = 'celltype_full',
cols = 'RdYlBu')+
RotatedAxis()
epi_plots <- list()
for(i in unique(epi_markers$cluster)){
# Identify current DEG
curr_markers <- epi_markers %>%
filter(cluster == i) %>%
mutate(significantly_upregulated = avg_log2FC > 0.5 & p_val_adj <= 0.01)
# extract significant DEG
top_curr <- curr_markers %>%
filter(avg_log2FC >= .5 & p_val_adj <= 0.01)
ego_curr <- enrichGO(top_curr$gene,
OrgDb = org.Mm.eg.db,
keyType = 'SYMBOL',
ont = 'ALL')
ego_curr <- pairwise_termsim(ego_curr)
p1 <- ggplot(curr_markers, aes(x = avg_log2FC, y = -log10(p_val_adj), label = gene, color = significantly_upregulated))+
geom_point()+
ggrepel::geom_text_repel()+
theme_bw()+
ylab('Significance \n [ -log10(p_val_adj) ]')+
xlab('Average log2 Fold Change')+
ggtitle(paste0(i, ' vs all other epithelial'))+
theme(legend.position = 'bottom')+
labs(color = 'Significantly Upregulated')+
scale_color_manual(values = c('black', 'red'))
p2 <- treeplot(ego_curr)
p3 <- dotplot(ego_curr)
p4 <- emapplot(ego_curr,
cex_label_category = 0.5,
repel = TRUE,
group_category = FALSE)+
ggtitle(paste0('Cluster ', i, ' top ontologies'))
# display plots in loop
print(p1)
print(p2)
print(p3)
print(p4)
# stash plots for access later
epi_plots[[as.character(i)]]$volcano <- p1
epi_plots[[as.character(i)]]$tree <- p2
epi_plots[[as.character(i)]]$dot <- p3
epi_plots[[as.character(i)]]$emap <- p4
}
## Warning: ggrepel: 1044 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 933 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 1489 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 413 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 1161 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
Figure 6G+6H: Heatmap of Fibroblast markers + Enrichment plot
dotplot: 600 x 480
so_fibro <- subset(so_merge, subset = lineage == 'fibroblast')
fibro_markers <- read_csv('analysis_files/s4_fibro_markers.csv')
## Rows: 2935 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): gene
## dbl (6): p_val, avg_log2FC, pct.1, pct.2, p_val_adj, cluster
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
top_fibro_markers <- fibro_markers %>%
group_by(cluster) %>%
filter(p_val_adj <= 0.01) %>%
arrange(desc(avg_log2FC)) %>%
slice_head(n = 10)
DoHeatmap(so_fibro,
features = top_fibro_markers$gene)+
ggtitle('Top fibroblast genes')
fibro_plots <- list()
for(i in unique(fibro_markers$cluster)){
# Identify current DEG
curr_markers <- fibro_markers %>%
filter(cluster == i) %>%
mutate(significantly_upregulated = avg_log2FC > 0.5 & p_val_adj <= 0.01)
# extract significant DEG
top_curr <- curr_markers %>%
filter(avg_log2FC >= .5 & p_val_adj <= 0.01)
# Enrichment of gene ontology using MSIGDB
ego_curr <- enrichGO(top_curr$gene,
OrgDb = org.Mm.eg.db,
keyType = 'SYMBOL',
ont = 'ALL')
ego_curr <- pairwise_termsim(ego_curr)
# Generate plots
p1 <- ggplot(curr_markers, aes(x = avg_log2FC, y = -log10(p_val_adj), label = gene, color = significantly_upregulated))+
geom_point()+
ggrepel::geom_text_repel()+
theme_bw()+
ylab('Significance \n [ -log10(p_val_adj) ]')+
xlab('Average log2 Fold Change')+
ggtitle(paste0(i, ' vs all other fibroblast'))+
theme(legend.position = 'bottom')+
labs(color = 'Significantly Upregulated')+
scale_color_manual(values = c('black', 'red'))
p2 <- treeplot(ego_curr)
p3 <- dotplot(ego_curr)
p4 <- emapplot(ego_curr,
cex_label_category = 0.5,
repel = TRUE,
group_category = FALSE)+
ggtitle(paste0('Cluster ', i, ' top ontologies'))
# display plots in loop
print(p1)
print(p2)
print(p3)
print(p4)
# stash plots for access later
fibro_plots[[as.character(i)]]$volcano <- p1
fibro_plots[[as.character(i)]]$tree <- p2
fibro_plots[[as.character(i)]]$dot <- p3
fibro_plots[[as.character(i)]]$emap <- p4
}
## Warning: ggrepel: 677 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 9 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 1175 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 986 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
Figure 7:
Fig 7B: ScPred classification of murine clusters
900 x 800
so_merge_scpred <- readRDS('analysis_files/s5_so_merge_subset_mda_scpred.rds')
freq <- so_merge_scpred@meta.data %>%
dplyr::select(celltype_full, scpred_prediction) %>%
table()
row.names(freq) <- paste0('MM: ', row.names(freq))
colnames(freq) <- paste0('HS: ', colnames(freq))
row_anno <- so_merge_scpred@meta.data %>%
dplyr::select(lineage, celltype_full) %>%
distinct() %>%
mutate(celltype_full = paste0('MM: ', celltype_full))
row_anno <- data.frame(lineage = row_anno$lineage,
row.names = row_anno$celltype_full)
pheatmap::pheatmap(prop.table(freq, margin = 1),
annotation_row = row_anno,
main = 'scPred assignment \n Row fraction',
color = colorRampPalette(c('white', 'black'))(50),
annotation_colors = list(lineage = lineage_cols))
# ARI of matrix without epithelial cells
scpred_stroma <- so_merge_scpred@meta.data %>%
filter(lineage != 'epithelial')
aricode::ARI(c1 = scpred_stroma$celltype_full,
c2 = scpred_stroma$scpred_prediction)
## [1] 0.4082607
# ARI of only epithelial cells
scpred_epi <- so_merge_scpred@meta.data %>%
filter(lineage == 'epithelial')
aricode::ARI(c1 = scpred_epi$celltype_full,
c2 = scpred_epi$scpred_prediction)
## [1] 0.09099002
# Remove so to reclaim memory
rm(so_merge_scpred)
Figure 7c: Integrated data iNMF score vs celltype/cluster
Load integrated data and prepare metadata
so_rliger <- readRDS('analysis_files/s5_xspecies_uinmf_so.rds')
so_rliger@meta.data$species <- str_split(rownames(so_rliger@meta.data), pattern = '_', simplify = TRUE)[,1]
celltype_major_l1_dict <- rbind(c('Endothelial', 'endothelial'),
c('CAFs', 'fibroblast'),
c('PVL', 'perivascular'),
c('B-cells', 'lymphoid'),
c('T-cells', 'lymphoid'),
c('Myeloid', 'myeloid'),
c('Normal Epithelial', 'epithelial'),
c('Plasmablasts', 'lymphoid'),
c('Cancer Epithelial', 'epithelial'))
# Convert celltype_major to celltype_l1
so_rliger@meta.data$celltype_major_to_l1[so_rliger@meta.data$species == 'human'] <- plyr::mapvalues(so_rliger@meta.data$celltype_major[so_rliger@meta.data$species == 'human'],
from = celltype_major_l1_dict[,1],
to = celltype_major_l1_dict[,2])
# Combine celltype_l1 from each species
so_rliger@meta.data <- tidyr::unite(data = so_rliger@meta.data,
col = 'celltype_l1.5',
celltype_major_to_l1,
lineage,
na.rm = TRUE,
sep = '',
remove = FALSE)
# Combine celltype_l2 and celltype_minor
so_rliger@meta.data <- tidyr::unite(data = so_rliger@meta.data,
col = 'celltype_l2.5',
celltype_full,
celltype_minor,
na.rm = TRUE,
sep = '',
remove = FALSE)
# Combine human 'subtype' and mouse 'cellfrac_type'
so_rliger@meta.data <- tidyr::unite(data = so_rliger@meta.data,
col = 'tumor_subtype',
subtype,
cellfrac_type,
na.rm = TRUE,
sep = '',
remove = FALSE)
# Combine human 'Patient' and mouse 'cellfrac_type'
so_rliger@meta.data <- tidyr::unite(data = so_rliger@meta.data,
col = 'sample_id',
Patient,
tumor,
na.rm = TRUE,
sep = '',
remove = FALSE)
# Add species label to celltype_l2.5
so_rliger@meta.data$species_celltype <- paste0(recode(so_rliger@meta.data$species,
'mouse' = 'MM: ',
'human' = 'HS: '),
so_rliger@meta.data$celltype_l2.5)
species_celltype_dict <- so_rliger@meta.data %>%
dplyr::select(species, celltype_l2.5) %>%
distinct()
1200 x 1100
# Average iNMF embedding to show relationship to cluster, annotated by celltype_l1
cluster_embedding <- tibble(seurat_clusters = so_rliger@meta.data$species_celltype) %>%
cbind(so_rliger@reductions$inmf@cell.embeddings) %>%
group_by(seurat_clusters) %>%
summarize(across(everything(), list(mean)))
row_anno_tibble <- so_rliger@meta.data %>%
dplyr::select(species_celltype, celltype_l1.5, species) %>%
distinct()
row_anno <- data.frame(species = row_anno_tibble$species,
lineage = row_anno_tibble$celltype_l1.5,
row.names = row_anno_tibble$species_celltype)
pheatmap::pheatmap(t(scale(t(data.frame(cluster_embedding[,2:ncol(cluster_embedding)],
row.names = cluster_embedding$seurat_clusters)))),
scale = 'none',
main = 'Average UINMF embedding per cluster \n row z-scored',
annotation_row = row_anno,
annotation_colors = list(lineage = lineage_cols,
species = species_cols))
Figure 7D: Original cluster/celltype vs integrated cluster
1200 x 1200
clust_lineage <- so_rliger@meta.data %>%
dplyr::select(RNA_snn_res.0.6, celltype_l1.5) %>%
table()
clust_lineage_prop <- prop.table(clust_lineage, margin = 1)
for(i in 1:nrow(clust_lineage_prop)){
if(max(clust_lineage_prop[i,]) > 0.8){
curr_lineage <- names(which.max(clust_lineage_prop[i,]))
}else{
curr_lineage <- 'mixed'
}
if(i == 1){
cluster_lineage_dict <- tibble(cluster = row.names(clust_lineage_prop)[i],
consensus_lineage = curr_lineage)
}else{
cluster_lineage_dict <- rbind(cluster_lineage_dict,
tibble(cluster = row.names(clust_lineage_prop)[i],
consensus_lineage = curr_lineage))
}
}
consensus_lineage_cols <- RColorBrewer::brewer.pal(n = length(unique(cluster_lineage_dict$consensus_lineage)), 'Set1')
names(consensus_lineage_cols) <- unique(cluster_lineage_dict$consensus_lineage)
so_rliger_meta <- so_rliger@meta.data
freq_table <- so_rliger_meta %>%
dplyr::select(species_celltype, RNA_snn_res.0.6) %>%
table()
row_anno <- so_rliger_meta %>%
dplyr::select(species_celltype, species, celltype_l1.5) %>%
distinct()
row_anno <- data.frame(species = row_anno$species,
lineage = row_anno$celltype_l1.5,
row.names = row_anno$species_celltype)
row_anno$lineage <- factor(row_anno$lineage, levels = unique(cluster_lineage_dict$consensus_lineage))
col_anno <- data.frame(row.names = cluster_lineage_dict$cluster,
consensus_lineage = cluster_lineage_dict$consensus_lineage)
anno_colors <- list(lineage = lineage_cols,
consensus_lineage = consensus_lineage_cols,
species = species_cols)
col_order <- cluster_lineage_dict %>%
arrange(consensus_lineage, cluster)
pheatmap::pheatmap(prop.table(freq_table, margin = 1),
annotation_row = row_anno,
annotation_col = col_anno,
annotation_colors = anno_colors,
color = colorRampPalette(colors = c('white', 'black'))(100),
main = 'Original identity versus integrated cluster \n (Row fraction)',
cluster_cols = TRUE)
Figure 7E: Integrated UMAP colored by lineage
DimPlot(so_rliger,
group.by = 'celltype_l1.5',
label = FALSE,
repel = TRUE,
cols = lineage_color_dict$color)+
coord_equal()+
ggtitle('Lineage')+
theme(plot.title = element_text(size = 25, face = 'bold'),
legend.text = element_text(face = 'bold'))+
guides(color = guide_legend(override.aes = list(size = 7)))
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
Figure 7F: UMAP split by species
480 x 480 per umap
# Randomly subset to an equivalent number
species_count <- table(so_rliger@meta.data$species)
mouse_barcodes <- so_rliger@meta.data %>%
mutate(cell_id = rownames(so_rliger@meta.data)) %>%
filter(species == 'mouse') %>%
dplyr::select(cell_id) %>%
pull()
set.seed(6)
human_barcodes <- so_rliger@meta.data %>%
mutate(cell_id = rownames(so_rliger@meta.data)) %>%
filter(species == 'human') %>%
mutate(random_n = sample(species_count[1])) %>%
filter(random_n <= species_count[2]) %>%
dplyr::select(cell_id) %>%
pull()
so_rliger_subset <- subset(so_rliger, cells = c(mouse_barcodes, human_barcodes))
DimPlot(so_rliger_subset,
group.by = 'RNA_snn_res.0.6',
label = TRUE,
label.size = 5,
split.by = 'species')+
coord_equal()+
theme(legend.position = 'none')+
ggtitle('Unsupervised clusters \n (subset to equal n)')
Supplementary Figure 13:
S13D: Integrated scRNAseq Lineage vs fraction of cluster
1200 x 300
clust_lineage <- so_rliger@meta.data %>%
dplyr::select(RNA_snn_res.0.6, celltype_l1.5) %>%
table()
clust_lineage_prop <- prop.table(clust_lineage, margin = 1)
pheatmap::pheatmap(t(clust_lineage_prop),
display_numbers = round(t(clust_lineage_prop),2),
main = 'Fraction of lineage per cluster',
color = colorRampPalette(colors = c('white', 'black'))(100))
for(i in 1:nrow(clust_lineage_prop)){
if(max(clust_lineage_prop[i,]) > 0.8){
curr_lineage <- names(which.max(clust_lineage_prop[i,]))
}else{
curr_lineage <- 'mixed'
}
if(i == 1){
cluster_lineage_dict <- tibble(cluster = row.names(clust_lineage_prop)[i],
consensus_lineage = curr_lineage)
}else{
cluster_lineage_dict <- rbind(cluster_lineage_dict,
tibble(cluster = row.names(clust_lineage_prop)[i],
consensus_lineage = curr_lineage))
}
}
Patient breakdown
meta_human <- so_rliger@meta.data %>%
filter(species == 'human') %>%
mutate(og_label = celltype_minor) %>%
mutate(sample_id = Patient)
meta_human$celltype_l1 <- plyr::mapvalues(x = meta_human$celltype_major,
from = celltype_major_l1_dict[,1],
to = celltype_major_l1_dict[,2])
meta_mouse <- so_rliger@meta.data %>%
filter(species == 'mouse') %>%
mutate(og_label = celltype_full) %>%
mutate(sample_id = tumor)
meta <- rbind(meta_mouse, meta_human)
freq_table <- meta %>%
dplyr::select(RNA_snn_res.0.9, sample_id) %>%
table()
col_anno <- meta %>%
dplyr::select(sample_id, subtype, cellfrac_type, species) %>%
distinct()
col_anno <- data.frame(subtype = col_anno$subtype,
cellfrac_type = col_anno$cellfrac_type,
species = col_anno$species,
row.names = col_anno$sample_id)
pheatmap::pheatmap(prop.table(freq_table, margin = 2),
annotation_col = col_anno)
# Supplementary figure 13
S13A - UMAP comparison - no integration, harmony and iNMF
all: 500 x 500
# Harmony
so_harmony <- RunUMAP(so_merge, dims = 1:50, reduction = 'harmony')
## 17:24:23 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:24:23 Read 14042 rows and found 50 numeric columns
## 17:24:23 Using Annoy for neighbor search, n_neighbors = 30
## 17:24:23 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:24:25 Writing NN index file to temp file C:\Users\calistri\AppData\Local\Temp\1\RtmpE9uzsI\file7a0c39dc102e
## 17:24:25 Searching Annoy index using 1 thread, search_k = 3000
## 17:24:29 Annoy recall = 100%
## 17:24:30 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 17:24:31 Initializing from normalized Laplacian + noise (using irlba)
## 17:24:33 Commencing optimization for 200 epochs, with 660676 positive edges
## 17:24:47 Optimization finished
harmony_umap <- DimPlot(so_harmony,
group.by = 'tumor')
rm(so_harmony)
# No integration
so_noint <- RunPCA(so_merge)
## PC_ 1
## Positive: Cd74, Hmgb2, H2-Ab1, Wfdc17, Lyz2, Bcl2a1b, C1qa, C1qb, S100a9, S100a8
## C1qc, Cd3g, Ccrl2, Clec4d, Clec4e, Ms4a7, Trbc2, Il1rn, Nlrp3, Igkc
## Gatm, AW112010, Vps37b, Itgae, Ccl4, Irf8, Trem1, Cx3cr1, Ifitm1, Apoe
## Negative: Fstl1, Col3a1, Aebp1, Dcn, Serping1, Col1a2, Fbn1, Mmp2, Sparc, Col5a2
## Col1a1, Lum, Mfap5, Rarres2, Adamts5, C1s1, Fn1, Bgn, Serpina3n, Loxl1
## Serpinh1, Ccdc80, Col5a1, Tnfaip6, Col6a1, Ogn, Clec3b, Thbs2, Serpinf1, Cpxm1
## PC_ 2
## Positive: Epcam, Fxyd3, Wfdc18, Krt8, Krt18, Lcn2, Plet1, Cldn7, Csn3, Nfib
## Ehf, Cldn3, Spint2, Dbi, Aqp5, Sox9, Igfbp5, Trf, Elf5, Folr1
## Tspan8, Smim22, Apoc1, Ano1, Phlda1, Mfge8, Ptprf, Me1, Rab25, Irx2
## Negative: Fth1, Cd74, Wfdc17, H2-Ab1, Lyz2, Apoe, Bcl2a1b, C1qa, C1qb, C1qc
## S100a9, Ifitm1, Clec4d, Ccrl2, Clec4e, Ms4a7, Gsn, Igfbp6, S100a8, Hmgb2
## Nlrp3, Cx3cr1, AW112010, Cxcl2, Il1r2, Il1rn, Ccl4, Slfn5, Trem1, Cd3g
## PC_ 3
## Positive: Fth1, Cxcl2, Ier3, Basp1, Lcn2, S100a8, S100a9, C3, Clec4e, Clec4d
## G0s2, Wfdc18, Ccrl2, Nlrp3, Wfdc17, Il1rn, Cstb, Igfbp6, Mt1, Cd24a
## Il1r2, Mgst1, Pi16, Krt18, Trem1, Tnfaip6, Ifitm1, Plet1, Acod1, Hp
## Negative: Cdh5, Egfl7, Gpihbp1, Flt1, Emcn, Plvap, Aqp1, Esam, Pecam1, Ptprb
## Kdr, Igfbp3, Col4a2, Adgrf5, Col4a1, Adgrl4, Mmrn2, Sox18, Gng11, Cd93
## Eng, Ednrb, Cldn5, Sparcl1, Sema6d, Sema3f, Mcam, Ramp2, Ecscr, Podxl
## PC_ 4
## Positive: Cd3g, AW112010, Trbc2, Rps2, Il2rb, Ctsw, Itgae, Hsp90ab1, Nkg7, Ndufa4
## Trac, Trbc1, Top2a, Ikzf2, Icos, Cd7, Pclaf, Xcl1, Dut, Txk
## Stmn1, Mki67, Eef1g, Smc2, Tcrg-C1, Gzmb, Prc1, Il2ra, Klrk1, Trdc
## Negative: S100a9, S100a8, Clec4d, Clec4e, Cxcl2, G0s2, Nlrp3, Ifitm1, Acod1, Ccrl2
## Trem1, Il1rn, Wfdc21, Hcar2, Ier3, Il1f9, Wfdc17, Il1r2, Lrg1, Cstdc4
## Upp1, Stfa2l1, Slfn4, Basp1, Ccl3, Retnlg, Ptgs2, Asprv1, Fth1, Hp
## PC_ 5
## Positive: Postn, Matn4, Il17b, Krt5, Tpm2, Ecrg4, Col11a1, Slpi, Igfbp2, Col8a1
## Krt14, Col7a1, Col16a1, Acta2, Tagln, Col12a1, Col17a1, Bgn, Myl9, Hmcn1
## Grb14, Trp63, Gas1, Mgp, Lgr5, Moxd1, Tnc, Megf6, Pdgfrl, Samd5
## Negative: Ly6c1, Pcolce2, Cd34, Tnxb, Clec3b, Efemp1, Itih5, Opcml, Has1, Dpp4
## Cadm3, Pi16, Scara5, Sfrp4, Adgrd1, Efhd1, Gfpt2, Slc4a4, Uap1, Ackr3
## Heg1, Pla1a, Lrrn4cl, Ifi205, Pcsk6, Cd55, C4b, Tmem100, Tnfaip6, Gas7
so_noint <- RunUMAP(so_noint, dims = 1:50, reduction = 'pca')
## 17:24:55 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:24:55 Read 14042 rows and found 50 numeric columns
## 17:24:55 Using Annoy for neighbor search, n_neighbors = 30
## 17:24:55 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:24:57 Writing NN index file to temp file C:\Users\calistri\AppData\Local\Temp\1\RtmpE9uzsI\file7a0c42705ec2
## 17:24:57 Searching Annoy index using 1 thread, search_k = 3000
## 17:25:00 Annoy recall = 100%
## 17:25:01 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 17:25:03 Initializing from normalized Laplacian + noise (using irlba)
## 17:25:04 Commencing optimization for 200 epochs, with 627536 positive edges
## 17:25:19 Optimization finished
noint_umap <- DimPlot(so_noint,
group.by = 'tumor')+
coord_equal()
rm(so_noint)
# iNMF
inmf_umap <- DimPlot(so_merge,
group.by = 'tumor')+
coord_equal()
noint_umap+
ggtitle('No integration \n 50 PCs')
harmony_umap+
ggtitle('Harmony integrated \n 50 components')
inmf_umap+
ggtitle('rLiger integrated \n 50 Factors')
S13B - clustering optimization
Created in s3_clustering_optimization.rmd
S13C - cluster vs lineage marker dotplot
1080x480 pixels
DotPlot(so_merge,
features = c('Ptprc', 'Cd74', 'Epcam', 'Cd14', 'Cd3e', 'Pdgfra', 'Krt5', 'Pecam1', 'Mki67', 'Pdgfrb'),
cols = 'RdYlBu',
group.by = 'cluster_l2')+
theme_bw()+
coord_flip()
S13D/E - c14 subset: umap & cluster vs lineage marker dotplot
480 x 480 both
library(harmony)
## Loading required package: Rcpp
c14 <- subset(so_merge,
subset = seurat_clusters == 14)
c14 <- FindVariableFeatures(c14)
c14 <- NormalizeData(c14)
c14 <- ScaleData(c14)
## Centering and scaling data matrix
c14 <- RunPCA(c14)
## PC_ 1
## Positive: Tmsb10, Trbc2, Sept1, Cd3g, Ablim1, Cd3d, Ptprcap, Trac, Skap1, S100a10
## Il2rb, Ctsw, Ctla2a, Pdcd4, Ltb, Lck, S100a6, Rora, Thy1, Nkg7
## Cd226, Cxcr6, Dut, Sh2d2a, Trbc1, Ikzf3, Il18r1, Itgae, Inpp4b, Ahnak
## Negative: C1qc, Ctss, C1qa, C1qb, Mpeg1, Cx3cr1, Apoe, Csf1r, Aif1, Lgmn
## Cd74, Ctsc, Ms4a7, Cxcl16, Cst3, Ctsz, Hexb, Gatm, Cybb, Tyrobp
## Tgfbi, Lyz2, Cd68, Ctsb, Pld4, Psap, Ftl1, H2-Ab1, Fcer1g, Man2b1
## PC_ 2
## Positive: Lcn2, Wfdc18, Csn3, Epcam, Nfib, Fxyd3, Trf, Cldn7, Cldn3, Igfbp5
## Cald1, Krt8, Dbi, Krt18, Aqp5, Sox9, Plet1, Sox4, Phlda1, Nfix
## Nedd4, Scgb2b27, Auts2, Apoc1, Tpm1, Cd9, Fermt2, Chil1, Ptprf, Cd24a
## Negative: AW112010, Srgn, Ltb, Crip1, Ptprcap, Ctsw, Vim, Cd3g, Ablim1, Sept1
## Il2rb, Nkg7, S100a4, Trbc2, Gimap4, Skap1, Ifngr1, Rgs1, Itgae, S100a10
## Cd226, Inpp4b, Bcl2, Lck, Esyt1, H2-Q7, Ptpn22, Cd3d, Atad2, Gzmb
## PC_ 3
## Positive: AY036118, Igkc, Maf, Il7r, Gm26917, Ccnb1ip1, Icos, Apoe, Cxcr6, Cd6
## Rora, Lst1, Cd74, Il2ra, Ms4a4b, Rmrp, Lgals1, Hbb-bs, Trac, Hba-a1
## Ly6a, Ccr2, Cd3d, Cd4, Il18r1, Hbb-bt, Ttc39c, H2-Aa, Ifi27l2a, Thy1
## Negative: Cemip2, Gzmb, Car2, Xcl1, Hsp90ab1, Rps2, Klra5, Ier5l, Ctla2b, Junb
## Rps18, Cdv3, Npm1, Hspd1, Fasl, Nme2, Nme1, Eif5a, Errfi1, Rpl12
## Sult2b1, Cebpb, Eef1b2, Nr4a2, Rps3, Eef1g, Klrk1, Itih5, Sfr1, Cited4
## PC_ 4
## Positive: Lgals1, Cavin3, Maf, S100a4, S100a10, Rhobtb3, Tmem64, Serpinb1a, Palld, Cav1
## Trdc, Cd163l1, Il7r, Rora, Ptpn14, Col6a1, Ramp1, Capg, Ppic, Phlda3
## Fat1, Igfbp3, Trdv4, Cx3cl1, Tpm2, Timp1, Krt15, Cd3d, Fkbp9, Ikzf2
## Negative: Xcl1, Scgb2b27, Klrd1, Nr4a2, Scgb1b27, Cemip2, Gzmb, Ncr1, Serpinb9, Klra5
## Car6, Ctla2a, Etv1, Wfdc18, Gzma, Vps37b, Serpinb6b, Chn2, Errfi1, Aqp5
## Neurl3, Ppp1r3b, Cd7, Klre1, Nkg7, Hba-a1, Hbb-bs, Gzmc, Ctla2b, Lcn2
## PC_ 5
## Positive: Krt15, Dsc3, Krt5, Krt14, Sparc, Fat1, Fkbp9, Vcan, Igsf10, Ppic
## Tagln, Slpi, Trp63, Cav1, Acta2, Postn, S100a16, Serpine2, A430105J06Rik, Igfbp3
## Hmcn1, Cxcl14, Phlda3, Sfrp1, Brinp3, Col4a1, Tpm2, Palld, Col6a1, Gas1
## Negative: Nme1, Nme2, Chchd10, Kcnk1, Rplp0, Rps3, Elf5, Atp2c2, 4931406C07Rik, Cldn7
## Cystm1, Kcnn4, Eef1b2, Ppp1r2, Wfdc18, Npm1, Prss8, Rpl12, 1110008P14Rik, Erh
## Nhp2, Ass1, Epcam, Rpsa, Smim22, Cd24a, Plet1, Rab25, Cldn3, Ehf
c14 <- RunHarmony(c14,
group.by.vars = 'library_id')
## Harmony 1/10
## Harmony 2/10
## Harmony 3/10
## Harmony 4/10
## Harmony 5/10
## Harmony 6/10
## Harmony 7/10
## Harmony converged after 7 iterations
## Warning: Invalid name supplied, making object name syntactically valid. New
## object name is Seurat..ProjectDim.RNA.harmony; see ?make.names for more details
## on syntax validity
ElbowPlot(c14, reduction = 'harmony')
c14 <- RunUMAP(c14, reduction = 'harmony', dims = 1:5,
seed.use = 500)
## 17:25:29 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:25:29 Read 340 rows and found 5 numeric columns
## 17:25:29 Using Annoy for neighbor search, n_neighbors = 30
## 17:25:29 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:25:29 Writing NN index file to temp file C:\Users\calistri\AppData\Local\Temp\1\RtmpE9uzsI\file7a0c5d9cd8c
## 17:25:29 Searching Annoy index using 1 thread, search_k = 3000
## 17:25:29 Annoy recall = 100%
## 17:25:30 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 17:25:32 Initializing from normalized Laplacian + noise (using irlba)
## 17:25:32 Commencing optimization for 500 epochs, with 11592 positive edges
## 17:25:33 Optimization finished
DimPlot(c14,
label = TRUE,
label.size = 6)+
ggtitle('Cluster 14 \n (5 Harmony components)')
DotPlot(c14,
features = c('Ptprc', 'Cd74', 'Epcam', 'Cd14', 'Cd3e', 'Pdgfra', 'Krt5', 'Pecam1', 'Pdgfrb'),
cols = 'RdYlBu',
group.by = 'cluster_l2')+
theme_bw()+
coord_flip()
## Warning: Scaling data with a low number of groups may produce misleading
## results
S13F: UMAP by cellfrac_type
1080x540
# Find smallest number of cells per cellfrac type
n_per_cellfrac_type <- so_merge@meta.data %>%
group_by(cellfrac_type) %>%
summarize(n_cells = n())
n_smallest <- min(n_per_cellfrac_type$n_cells)
# Randomly subsample each cellfrac type
set.seed(7)
sp_barcodes <- so_merge@meta.data %>%
mutate(cell_id = rownames(so_merge@meta.data)) %>%
filter(cellfrac_type == 'SP') %>%
mutate(random_n = sample(nrow(.))) %>%
filter(random_n <= n_smallest) %>%
dplyr::select(cell_id) %>%
pull()
set.seed(8)
srip_barcodes <- so_merge@meta.data %>%
mutate(cell_id = rownames(so_merge@meta.data)) %>%
filter(cellfrac_type == 'SR_IP') %>%
mutate(random_n = sample(nrow(.))) %>%
filter(random_n <= n_smallest) %>%
dplyr::select(cell_id) %>%
pull()
set.seed(9)
srir_barcodes <- so_merge@meta.data %>%
mutate(cell_id = rownames(so_merge@meta.data)) %>%
filter(cellfrac_type == 'SR_IR') %>%
mutate(random_n = sample(nrow(.))) %>%
filter(random_n <= n_smallest) %>%
dplyr::select(cell_id) %>%
pull()
# Plot UMAP by cellfrac type
DimPlot(subset(so_merge, cells = c(sp_barcodes, srip_barcodes, srir_barcodes)),
group.by = 'lineage',
label = FALSE,
split.by = 'cellfrac_type',
pt.size = 1.5)+
coord_equal()+
theme(legend.position = 'bottom')+
ggtitle('Lineage by cellfrac_type \n (subset to equal n)')+
guides(color = guide_legend(nrow = 1, override.aes = list(size = 5)))
Supplemental 13G
960 x 480 pixels
so_merge_meta <- so_merge@meta.data %>%
mutate(tumor_pheno = paste0(tumor, '-', histology))
freq_table <- so_merge_meta %>%
dplyr::select(lineage, tumor_pheno) %>%
table()
cellfrac_type_dict <- so_merge_meta %>%
dplyr::select(cellfrac_type, tumor_pheno) %>%
distinct()
col_anno <- tibble(histology = str_split(colnames(freq_table), pattern = '-', simplify = TRUE)[,2],
log10_n_cells = log10(colSums(freq_table)),
tumor_pheno = colnames(freq_table))
col_anno <- full_join(x = col_anno,
y = cellfrac_type_dict,
by = 'tumor_pheno',
suffix = c('', ''))
col_anno <- data.frame(histology = col_anno$histology,
cellfrac_type = col_anno$cellfrac_type,
log10_n_cells = col_anno$log10_n_cells,
row.names = col_anno$tumor_pheno)
cellfrac_cols <- scales::hue_pal()(3)
names(cellfrac_cols) <- c('SP', 'SR_IP', 'SR_IR')
anno_colors <- list(cellfrac_type = cellfrac_cols,
histology = c(SP = 'magenta', SR = 'cyan'))
pheatmap::pheatmap(prop.table(freq_table, margin = 2),
display_numbers = freq_table,
main = 'column scaled (cell fraction) \n Number = n_cells per celltype per tumor',
annotation_col = col_anno,
annotation_colors = anno_colors)
Supplementary Figure 14
S14A - Top DEG dotplot
1920 x 720
markers <- read_csv('analysis_files/s3_cluster_l2_markers.csv')
## Rows: 43572 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): gene
## dbl (6): p_val, avg_log2FC, pct.1, pct.2, p_val_adj, cluster
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
top_markers <- markers %>%
filter(p_val_adj <= 0.01) %>%
filter(pct.1 >= 0.5) %>%
group_by(cluster) %>%
arrange(desc(avg_log2FC)) %>%
slice_head(n = 3)
DotPlot(so_merge,
features = unique(top_markers$gene),
cols = 'RdYlBu',
group.by = 'celltype_full')+
RotatedAxis()
S14B/C - NMF score vs lineage/cluster
lineage vs iNMF embedding: 1080 x 480
cluster vs iNMF embedding: 1080 x 600
# Average iNMF embeddings to show relationship to celltype_l1
cluster_embedding <- tibble(lineage = so_merge@meta.data$lineage) %>%
cbind(so_merge@reductions$iNMF@cell.embeddings) %>%
group_by(lineage) %>%
summarize(across(everything(), list(mean)))
pheatmap::pheatmap(t(scale(t(data.frame(cluster_embedding[,2:ncol(cluster_embedding)],
row.names = cluster_embedding$lineage)))),
scale = 'none',
main = 'Average iNMF embedding per cluster \n Z-scored by row')
# Average iNMF embedding to show relationship to cluster, annotated by celltype_l1
cluster_embedding <- tibble(celltype_full = so_merge@meta.data$celltype_full) %>%
cbind(so_merge@reductions$iNMF@cell.embeddings) %>%
group_by(celltype_full) %>%
summarize(across(everything(), list(mean)))
annotation_rows <- so_merge@meta.data %>%
dplyr::select(lineage, celltype_full) %>%
group_by_all() %>%
distinct()
annotation_rows <- data.frame(celltype = annotation_rows$lineage,
row.names = annotation_rows$celltype_full)
pheatmap::pheatmap(t(scale(t(data.frame(cluster_embedding[,2:ncol(cluster_embedding)],
row.names = cluster_embedding$celltype_full)))),
scale = 'none',
main = 'Average iNMF embedding per cluster \n Z-scored by row',
annotation_row = annotation_rows)
S14D: Fraction of tumor by lineage (only lineages with >1 cluster)
so_meta <- so_merge@meta.data
subtype_counts <- so_meta %>%
dplyr::select(tumor, cellfrac_type) %>%
distinct() %>%
group_by(cellfrac_type) %>%
summarize(n_tumors = n())
so_meta_tumors <- so_meta %>%
filter(lineage %in% c('epithelial', 'fibroblast', 'lymphoid', 'myeloid')) %>%
dplyr::select(celltype_full, cellfrac_type, lineage, tumor) %>%
group_by(tumor, cellfrac_type, celltype_full, lineage) %>%
summarize(n_cells = n()) %>%
group_by(tumor, lineage) %>%
mutate(freq = n_cells/sum(n_cells))
## `summarise()` has grouped output by 'tumor', 'cellfrac_type', 'celltype_full'.
## You can override using the `.groups` argument.
so_meta_tumors_grouped <- full_join(x = so_meta_tumors,
y = subtype_counts,
by = 'cellfrac_type') %>%
group_by(cellfrac_type, celltype_full) %>%
mutate(mean_freq = mean(freq)) %>%
mutate(freq_sem_ymin = mean(freq) - sd(freq)/(n_tumors^0.5)) %>%
mutate(freq_sem_ymax = mean(freq) + sd(freq)/(n_tumors^0.5))
so_meta_tumors_summary <- so_meta_tumors %>%
group_by(cellfrac_type, celltype_full) %>%
summarize(tumor_count = n(),
mean_freq = mean(freq),
freq_sd_ymin = mean(freq) - sd(freq),
freq_sd_ymax = mean(freq) + sd(freq),
freq_sem_ymin = mean(freq) - sd(freq)/(tumor_count)^0.5,
freq_sem_ymax = mean(freq) + sd(freq)/(tumor_count)^0.5,
lineage = lineage)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## `summarise()` has grouped output by 'cellfrac_type', 'celltype_full'. You can
## override using the `.groups` argument.
# mean freq of tumor
ggplot(so_meta_tumors_summary, aes(y = mean_freq, x = celltype_full, fill = cellfrac_type))+
geom_col(position = 'dodge')+
geom_point(data = so_meta_tumors, aes(x = celltype_full, y = freq), color = 'gray50', size = 1.5)+
theme_bw()+
geom_errorbar(aes(x = celltype_full, ymin = freq_sem_ymin, ymax = freq_sem_ymax), position = 'dodge', width = 0.5)+
facet_grid(rows = vars(cellfrac_type), cols = vars(lineage), scales = 'free_x', space = 'free_x')+
theme(strip.text.x = element_text(angle = 90))+
ylab('Mean frequency \n (fraction of tumor)')+
xlab('Celltype \n (cluster-celltype)')+
RotatedAxis()
S14D - Cluster distribution per tumor
1600 x 720
so_meta <- so_merge@meta.data
so_meta_tumors2 <- so_meta %>%
dplyr::select(celltype_full, cellfrac_type, lineage, tumor) %>%
group_by(tumor, cellfrac_type, celltype_full, lineage) %>%
summarize(n_cells = n()) %>%
group_by(tumor) %>%
mutate(freq = n_cells/sum(n_cells))
## `summarise()` has grouped output by 'tumor', 'cellfrac_type', 'celltype_full'.
## You can override using the `.groups` argument.
# Order tumors by subtype
tumor_order <- so_meta_tumors2 %>%
ungroup() %>%
dplyr::select(tumor, cellfrac_type) %>%
distinct() %>%
arrange(cellfrac_type) %>%
dplyr::select(tumor) %>%
pull()
# Add tumor order as factor to tumor id & find frequency of cluster per tumor
so_meta_tumors2 <- so_meta_tumors2 %>%
mutate(tumor = factor(tumor, levels = tumor_order)) %>%
group_by(tumor) %>%
mutate(freq = n_cells/sum(n_cells))
# Plot
ggplot(so_meta_tumors2, aes(x = freq, y = celltype_full, fill = cellfrac_type))+
geom_col()+
theme_bw()+
facet_grid(lineage~tumor, scales = 'free_y', space = 'free')+
theme(axis.text.x = element_text(angle = 45, hjust = 1),
strip.text.y = element_text(angle = 0))+
theme(plot.margin = unit(c(1,1,1,1), 'cm'))+
ylab('Cluster')+
xlab('Frequency of cluster in tumor')
S14E: Epithelial biomarker expression
so_epi <- subset(so_merge,
subset = lineage == 'epithelial')
Idents(so_epi) <- 'celltype_full'
krts <- grep(rownames(so_epi),
pattern = '^Krt',
value = TRUE)
goi <- c(krts,
'Epcam',
'Vim',
'Cd24',
'Cd44')
goi_exp <- AverageExpression(so_epi,
features = goi)$RNA
## Warning: The following 1 features were not found in the RNA assay: Cd24
## Warning: The following 117 features were not found in the HTO assay:
## Krtap28-10, Krtap28-13, Krtcap2, Krtcap3, Krtdap, Krtap5-2, Krtap5-3, Krtap5-5,
## Krtap5-1, Krtap5-4, Krtap12-1, Krtap10-4, Krtap10-10, Krt222, Krt24, Krt25,
## Krt26, Krt27, Krt28, Krt10, Krt12, Krt20, Krt23, Krt39, Krt40, Krtap3-3,
## Krtap3-2, Krtap3-1, Krtap1-5, Krtap1-4, Krtap1-3, Krtap9-3, Krtap2-4, Krtap4-1,
## Krtap4-2, Krtap4-7, Krtap4-6, Krtap4-8, Krtap4-9, Krtap4-13, Krtap4-16,
## Krtap9-1, Krtap31-1, Krtap31-2, Krtap9-5, Krtap29-1, Krtap16-1, Krtap17-1,
## Krt33a, Krt33b, Krt34, Krt31, Krt32, Krt35, Krt36, Krt13, Krt15, Krt19, Krt9,
## Krt14, Krt16, Krt17, Krt42, Krt80, Krt7, Krt87, Krt88, Krt81, Krt86, Krt83,
## Krt84, Krt82, Krt90, Krt75, Krt6b, Krt6a, Krt5, Krt71, Krt72, Krt73, Krt2,
## Krt1, Krt77, Krt76, Krt4, Krt79, Krt78, Krt8, Krt18, Krtap24-1, Krtap26-1,
## Krtap27-1, Krtap13-1, Krtap13, Krtap14, Krtap15, Krtap19-1, Krtap19-2,
## Krtap19-3, Krtap19-4, Krtap19-5, Krtap19-9b, Krtap16-3, Krtap22-2, Krtap6-1,
## Krtap6-5, Krtap6-3, Krtap20-2, Krtap21-1, Krtap6-2, Krtap8-1, Krtap7-1,
## Krtap11-1, Epcam, Vim, Cd24, Cd44
## Warning: None of the features specified were found in the HTO assay.
goi_exp <- goi_exp[rowSums(goi_exp) > 1, ]
pheatmap::pheatmap(t(goi_exp),
color = colorRampPalette(c('purple', 'black', 'yellow'))(50),
main = 'Luminal-basal marker expression',
scale = 'column')
Supplemental Figure 15
S15A: UINMF jaccard index
560 x 860
top_n <- 50
n_shared_factors <- ncol(so_rliger@reductions$inmf@feature.loadings)
inmf_top_features <- list()
for(i in 1:n_shared_factors){
curr_features <- so_rliger@reductions$inmf@feature.loadings[,i]
top_features <- sort(curr_features, decreasing = TRUE)[1:top_n]
inmf_top_features[[i]] <- names(top_features)
}
inmf_jaccard <- array(dim = c(n_shared_factors, n_shared_factors))
rownames(inmf_jaccard) <- colnames(inmf_jaccard) <- 1:n_shared_factors
for(i in 1:n_shared_factors){
for(j in 1:n_shared_factors){
inmf_jaccard[i,j] <- length(intersect(inmf_top_features[[i]], inmf_top_features[[j]]))/length(union(inmf_top_features[[i]], inmf_top_features[[j]]))
}
}
pheatmap::pheatmap(inmf_jaccard,
color = colorRampPalette(colors = c('white', 'red'))(100),
breaks = seq(from = 0, to = 0.5, by = 0.005),
main = 'Jaccard similarity of UINMF shared-feature factor loadings',
cutree_rows = 7,
cutree_cols = 7)
S15B: Average UINMF embedding per lineage
lineage_embedding <- tibble(lineage = so_rliger@meta.data$celltype_l1.5) %>%
cbind(so_rliger@reductions$inmf@cell.embeddings) %>%
group_by(lineage) %>%
summarize(across(everything(), list(mean)))
pheatmap::pheatmap(scale(t(data.frame(lineage_embedding[,2:ncol(lineage_embedding)],
row.names = lineage_embedding$lineage))),
scale = 'none',
main = 'Average UINMF embedding per lineage \n row z-scored',)
S15C: Clustering optimization
Figure produced in script S5_human_tnbc_integration_rliger
S15D: Fraction of lineage per cluster
1200 x 300
clust_lineage <- so_rliger@meta.data %>%
dplyr::select(RNA_snn_res.0.6, celltype_l1.5) %>%
table()
clust_lineage_prop <- prop.table(clust_lineage, margin = 1)
pheatmap::pheatmap(t(clust_lineage_prop),
display_numbers = round(t(clust_lineage_prop),2),
main = 'Fraction of lineage per cluster',
color = colorRampPalette(colors = c('white', 'black'))(100))
S15E: Sample composition vs unsupervised cluster
freqs <- so_rliger_meta %>%
dplyr::select(sample_id, seurat_clusters) %>%
table()
col_anno <- data.frame(row.names = cluster_lineage_dict$cluster,
lineage = cluster_lineage_dict$consensus_lineage)
row_anno <- so_rliger_meta %>%
dplyr::select(sample_id, species, tumor_subtype) %>%
distinct()
row_anno <- data.frame(row_anno[,-1],
row.names = row_anno$sample_id)
anno_colors <- list(lineage = lineage_cols)
col_order <- cluster_lineage_dict %>%
arrange(consensus_lineage, cluster)
pheatmap::pheatmap(prop.table(freqs, margin = 1),
annotation_col = col_anno,
annotation_row = row_anno,
color = colorRampPalette(colors = c('white', 'black'))(100),
main = ('Sample composition \n Sample ID vs unsupervised cluster \n (Row Fraction)'))
sessionInfo()
sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19044)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] harmony_0.1.1 Rcpp_1.0.9 enrichplot_1.16.2
## [4] org.Mm.eg.db_3.15.0 AnnotationDbi_1.58.0 IRanges_2.32.0
## [7] S4Vectors_0.36.0 Biobase_2.58.0 BiocGenerics_0.44.0
## [10] clusterProfiler_4.4.4 SeuratObject_4.1.3 Seurat_4.3.0.9001
## [13] lubridate_1.9.2 forcats_1.0.0 stringr_1.5.0
## [16] dplyr_1.1.0 purrr_1.0.1 readr_2.1.4
## [19] tidyr_1.3.0 tibble_3.1.8 ggplot2_3.4.1
## [22] tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.2 spatstat.explore_3.0-6 reticulate_1.28
## [4] tidyselect_1.2.0 RSQLite_2.3.0 htmlwidgets_1.6.1
## [7] grid_4.2.2 BiocParallel_1.32.4 Rtsne_0.16
## [10] scatterpie_0.1.8 munsell_0.5.0 ragg_1.2.5
## [13] codetools_0.2-19 ica_1.0-3 future_1.32.0
## [16] miniUI_0.1.1.1 withr_2.5.0 spatstat.random_3.1-3
## [19] colorspace_2.0-3 GOSemSim_2.22.0 progressr_0.13.0
## [22] highr_0.10 knitr_1.42 rstudioapi_0.14
## [25] ROCR_1.0-11 tensor_1.5 DOSE_3.22.1
## [28] listenv_0.9.0 labeling_0.4.2 GenomeInfoDbData_1.2.9
## [31] polyclip_1.10-4 pheatmap_1.0.12 bit64_4.0.5
## [34] farver_2.1.1 downloader_0.4 treeio_1.20.2
## [37] parallelly_1.34.0 vctrs_0.5.2 generics_0.1.3
## [40] xfun_0.37 timechange_0.1.1 R6_2.5.1
## [43] GenomeInfoDb_1.34.4 ggbeeswarm_0.7.1 graphlayouts_0.8.4
## [46] rmdformats_1.0.4 bitops_1.0-7 spatstat.utils_3.0-2
## [49] cachem_1.0.6 fgsea_1.22.0 gridGraphics_0.5-1
## [52] vroom_1.6.0 promises_1.2.0.1 scales_1.2.1
## [55] ggraph_2.1.0 beeswarm_0.4.0 gtable_0.3.1
## [58] globals_0.16.2 goftest_1.2-3 tidygraph_1.2.3
## [61] rlang_1.0.6 systemfonts_1.0.4 splines_4.2.2
## [64] lazyeval_0.2.2 spatstat.geom_3.0-6 yaml_2.3.6
## [67] reshape2_1.4.4 abind_1.4-5 httpuv_1.6.8
## [70] qvalue_2.28.0 tools_4.2.2 bookdown_0.33
## [73] ggplotify_0.1.0 ellipsis_0.3.2 jquerylib_0.1.4
## [76] RColorBrewer_1.1-3 ggridges_0.5.4 plyr_1.8.7
## [79] zlibbioc_1.44.0 RCurl_1.98-1.10 deldir_1.0-6
## [82] pbapply_1.7-0 viridis_0.6.2 cowplot_1.1.1
## [85] zoo_1.8-11 ggrepel_0.9.2 cluster_2.1.4
## [88] magrittr_2.0.3 data.table_1.14.6 scattermore_0.8
## [91] DO.db_2.9 lmtest_0.9-40 RANN_2.6.1
## [94] ggnewscale_0.4.8 fitdistrplus_1.1-8 matrixStats_0.63.0
## [97] hms_1.1.2 patchwork_1.1.2 mime_0.12
## [100] evaluate_0.20 xtable_1.8-4 gridExtra_2.3
## [103] compiler_4.2.2 shadowtext_0.1.2 KernSmooth_2.23-20
## [106] crayon_1.5.2 htmltools_0.5.4 ggfun_0.0.9
## [109] later_1.3.0 tzdb_0.3.0 aplot_0.1.10
## [112] DBI_1.1.3 tweenr_2.0.2 MASS_7.3-58.1
## [115] Matrix_1.5-1 cli_3.6.0 parallel_4.2.2
## [118] igraph_1.3.5 pkgconfig_2.0.3 sp_1.6-0
## [121] plotly_4.10.1 spatstat.sparse_3.0-0 ggtree_3.4.4
## [124] vipor_0.4.5 bslib_0.4.2 XVector_0.38.0
## [127] yulab.utils_0.0.6 digest_0.6.29 sctransform_0.3.5
## [130] RcppAnnoy_0.0.20 spatstat.data_3.0-1 Biostrings_2.64.1
## [133] rmarkdown_2.20 leiden_0.4.3 fastmatch_1.1-3
## [136] tidytree_0.4.2 uwot_0.1.14 shiny_1.7.4
## [139] aricode_1.0.1 lifecycle_1.0.3 nlme_3.1-160
## [142] jsonlite_1.8.4 viridisLite_0.4.1 fansi_1.0.3
## [145] pillar_1.8.1 lattice_0.20-45 ggrastr_1.0.1
## [148] KEGGREST_1.36.3 fastmap_1.1.0 httr_1.4.5
## [151] survival_3.4-0 GO.db_3.15.0 glue_1.6.2
## [154] png_0.1-8 bit_4.0.5 ggforce_0.4.1
## [157] stringi_1.7.8 sass_0.4.5 blob_1.2.3
## [160] textshaping_0.3.6 memoise_2.0.1 ape_5.7
## [163] irlba_2.3.5.1 future.apply_1.10.0